library(tidyverse)
library(magrittr)
library(parallel)
library(here)
library(AnnotationHub)
library(purrr)
library(scales)
library(kableExtra)
library(edgeR)
library(ggrepel)
library(fgsea)
library(RColorBrewer)
library(pheatmap)
library(cqn)
library(ggpubr)
library(DT)
library(htmltools)
library(pander)
if (interactive()) setwd(here::here())
theme_set(theme_bw())
cores <- detectCores() - 1
source(here("R/lbFuncs.R"))
ah <- AnnotationHub() %>%
subset(species == "Danio rerio") %>%
subset(rdataclass == "EnsDb")
ensDb <- ah[["AH83189"]] ## Ens101
transcripts <- transcripts(ensDb)
txLen <- exonsBy(ensDb, "tx") %>%
width() %>%
vapply(sum, integer(1))
mcols(transcripts) <- mcols(transcripts)[
c("tx_id", "gene_id", "gc_content")
] %>%
as.data.frame() %>%
mutate(length = txLen)
geneGc <- transcripts %>%
mcols() %>%
as_tibble() %>%
group_by(gene_id) %>%
summarise(
gc_content = sum(gc_content*length) / sum(length),
length = ceiling(median(length))
)
genes <- genes(ensDb)
mcols(genes) <- mcols(genes)[
c("gene_id", "gene_name", "gene_biotype", "entrezid")
] %>%
as.data.frame() %>%
left_join(geneGc)
An EnsDb object was obtained for Ensembl release 101 with the AnnotationHub package. This contained the information required to set up gene and transcript annotations. Gene-level estimates of GC content and length were also generated which are required for assessing potential biases in downstream analysis.
metadata <- read_csv(here("files/samples.csv")) %>%
dplyr::select(-sample) %>%
dplyr::rename(sample = basename, genotype = Genotype) %>%
## We need some sample aliases that follow R naming conventions
mutate(
alias = c(
paste0(rep("fAD", 7), seq(1, 7)),
paste0(rep("fAI", 8), seq(1, 8)),
paste0(rep("wt", 9), seq(1, 9))
),
group = genotype
)
metadata$genotype <- fct_relevel(
metadata$genotype,
c("WT", "EOfAD-like/+", "fAI-like/+")
)
genoCols <- metadata$genotype %>%
levels() %>%
length() %>%
brewer.pal("Set1") %>%
setNames(levels(metadata$genotype))
counts <- read_tsv(here("03_align/featureCounts/genes.out")) %>%
set_colnames(basename(colnames(.))) %>%
set_colnames(str_remove(colnames(.), "Aligned.sortedByCoord.out.bam"))
minLib <- counts %>%
dplyr::select(., str_subset(colnames(.), "KB")) %>%
colSums() %>%
min()
minCPM <- as.vector(cpm(10, lib.size = minLib))
minSamples <- metadata %>%
split(.$genotype) %>%
lapply(nrow) %>%
unlist() %>%
min()
dgeList <- counts %>%
as.data.frame() %>%
column_to_rownames("Geneid") %>%
.[rowSums(cpm(.) >= minCPM) >= minSamples,] %>%
DGEList(
samples = tibble(sample = colnames(.)) %>%
left_join(metadata),
genes = genes[rownames(.),] %>%
as_tibble() %>%
dplyr::select(chromosome = seqnames, everything())
) %>%
calcNormFactors()
Gene-level count data was loaded from featureCounts and imported into R as a DGEList object. Genes were removed from the counts matrix if they had less than 0.22 CPM (~10 reads) in more than 17 samples. This meant that each remaining gene must have assigned reads detected in at least one genotype group.
21147 genes remained for further analysis with library sizes ranging from 46,298,106 to 71,535,868.
cpm(dgeList, log = TRUE) %>%
as.data.frame() %>%
pivot_longer(
cols = everything(),
names_to = "sample",
values_to = "logCPM"
) %>%
left_join(metadata) %>%
ggplot(aes(logCPM, colour = genotype, group = sample)) +
geom_density() +
scale_colour_manual(values = genoCols) +
labs(
x = "logCPM",
y = "Density",
colour = "Genotype"
)
Expression density plots for all samples after filtering
pca <- dgeList %>%
cpm(log = TRUE) %>%
t() %>%
prcomp()
pcaVars <- percent_format(0.1)(summary(pca)$importance["Proportion of Variance",])
pca$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
left_join(metadata) %>%
as_tibble() %>%
ggplot(aes(PC1, PC2, colour = genotype, fill = genotype, label = alias)) +
geom_point(size = 2) +
geom_text_repel(show.legend = FALSE) +
stat_ellipse(geom = "polygon", alpha = 0.05, show.legend = FALSE) +
guides(fill = FALSE) +
scale_colour_manual(values = genoCols) +
labs(
x = paste0("PC1 (", pcaVars[["PC1"]], ")"),
y = paste0("PC2 (", pcaVars[["PC2"]], ")"),
colour = "Genotype"
)
Principal Component Analysis of gene-level counts
A Principal Component Analysis (PCA) was performed using the sample logCPM values. It appears there is no clustering of genotypes, which may be due to the subtlty of the mutations and early time point (6 months). Additionally, PC1 only accounts for 12.1% of the variance.
The design matrix was defined containing an intercept representing baseline expression in wildtype samples. The remaining two columns represent the presence of either the EofAD or fAI mutant alleles.
design <- model.matrix(~genotype, data = dgeList$samples) %>%
set_colnames(str_remove(colnames(.), "genotype"))
pheatmap(
design,
cluster_cols = FALSE,
cluster_rows = FALSE,
color = c("white", "grey50"),
annotation_row = dgeList$samples["genotype"],
annotation_colors = list(genotype = genoCols),
angle_col = "315",
legend = FALSE
)
Visualisation of the design matrix
Genewise negative binomial generalized linear models were using the glmFit() function of the edgeR package. Likelihood ratio tests were then performed for the coefficients in the model with glmLRT(). Here the null hypothesis, \(H_0\), is testing that the true values of the coefficients are equal to 0. Due to the subtle effects of the mutation outlined in the PCA, genes were considered to be DE if they had an FDR-adjusted p-value < 0.05, as opposed to more stringent error rate controls such as the Bonferroni method.
alpha <- 0.05
minLfc <- 1
fit <- estimateDisp(dgeList, design) %>%
glmFit()
topTables <- colnames(design)[2:3] %>%
sapply(function(x){
glmLRT(fit, coef = x) %>%
topTags(n = Inf) %>%
.[["table"]] %>%
as_tibble() %>%
arrange(PValue) %>%
dplyr::select(
gene_id, gene_name, logFC, logCPM, PValue, FDR, everything()
) %>%
mutate(
coef = x,
bonfP = p.adjust(PValue, "bonf"),
DE = ifelse(FDR < 0.05, TRUE, FALSE)
)
}, simplify = FALSE)
With this criteria, the following genes were classified as DE:
topTables$`EOfAD-like/+` %>%
dplyr::filter(DE) %>%
dplyr::select(gene_id, gene_name, logFC, logCPM, PValue, FDR, bonfP) %>%
mutate(
logCPM = formatC(logCPM, digits = 2, format = "f"),
logFC = formatC(logFC, digits = 2, format = "f"),
PValue = formatC(PValue, digits = 2, format = "e"),
FDR = formatC(FDR, digits = 2, format = "e"),
bonfP = formatC(bonfP, digits = 2, format = "e"),
) %>%
kable(
align = "l",
caption = paste(
"The", nrow(.), "differentially expressed genes for EOfAD genotype",
"in comparison to wildtype"
)
) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed", "responsive")
)
| gene_id | gene_name | logFC | logCPM | PValue | FDR | bonfP |
|---|---|---|---|---|---|---|
| ENSDARG00000078322 | col12a1a | 1.87 | 3.83 | 6.26e-21 | 1.32e-16 | 1.32e-16 |
| ENSDARG00000093378 | si:ch211-235i11.5 | 0.63 | 2.96 | 3.36e-11 | 3.55e-07 | 7.10e-07 |
| ENSDARG00000079324 | ano9b | 1.38 | -1.19 | 3.16e-08 | 2.22e-04 | 6.67e-04 |
| ENSDARG00000020448 | adi1 | 0.54 | 2.81 | 3.15e-07 | 1.67e-03 | 6.67e-03 |
| ENSDARG00000092191 | CR318588.1 | 3.32 | 2.45 | 1.25e-06 | 4.99e-03 | 2.65e-02 |
| ENSDARG00000001993 | myhb | 4.04 | 2.38 | 1.41e-06 | 4.99e-03 | 2.99e-02 |
| ENSDARG00000088298 | si:ch211-235i11.4 | 0.34 | 3.57 | 2.44e-06 | 7.36e-03 | 5.15e-02 |
| ENSDARG00000094719 | CR318588.3 | 2.05 | 2.71 | 4.64e-06 | 1.23e-02 | 9.81e-02 |
| ENSDARG00000056065 | mov10b.2 | 1.38 | -1.77 | 1.68e-05 | 3.95e-02 | 3.56e-01 |
| ENSDARG00000088507 | znf982 | 0.39 | 2.28 | 2.31e-05 | 4.89e-02 | 4.89e-01 |
topTables$`fAI-like/+` %>%
dplyr::filter(DE) %>%
dplyr::select(gene_id, gene_name, logFC, logCPM, PValue, FDR, bonfP) %>%
mutate(
logCPM = formatC(logCPM, digits = 2, format = "f"),
logFC = formatC(logFC, digits = 2, format = "f"),
PValue = formatC(PValue, digits = 2, format = "e"),
FDR = formatC(FDR, digits = 2, format = "e"),
bonfP = formatC(bonfP, digits = 2, format = "e"),
) %>%
kable(
align = "l",
caption = paste(
"The", nrow(.), "differentially expressed genes for fAI genotype",
"in comparison to wildtype"
)
) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed", "responsive")
)
| gene_id | gene_name | logFC | logCPM | PValue | FDR | bonfP |
|---|---|---|---|---|---|---|
| ENSDARG00000004870 | psen1 | -0.78 | 5.33 | 3.84e-75 | 8.13e-71 | 8.13e-71 |
| ENSDARG00000005179 | dglucy | -0.96 | 5.01 | 3.70e-09 | 3.91e-05 | 7.82e-05 |
| ENSDARG00000061758 | sh3pxd2ab | -3.04 | 1.06 | 3.02e-08 | 2.13e-04 | 6.39e-04 |
| ENSDARG00000067665 | fam167aa | -3.39 | 0.33 | 7.62e-08 | 4.03e-04 | 1.61e-03 |
| ENSDARG00000105453 | kcnk17 | -1.52 | -1.00 | 7.63e-07 | 3.23e-03 | 1.61e-02 |
| ENSDARG00000094098 | si:ch211-278p9.3 | -2.29 | -2.35 | 1.22e-05 | 4.28e-02 | 2.57e-01 |
deCols <- c(`TRUE` = "red", `FALSE` = "grey50")
topTables %>%
bind_rows() %>%
arrange(DE) %>%
ggplot(aes(logCPM, logFC)) +
geom_point(aes(colour = DE), alpha = 0.4) +
geom_text_repel(
aes(label = gene_name, colour = DE),
data = . %>% dplyr::filter(DE)
) +
geom_smooth(se = FALSE) +
facet_wrap(~coef, nrow = 2) +
scale_y_continuous(breaks = seq(-8, 8, by = 2)) +
scale_colour_manual(values = deCols) +
theme(legend.position = "none")
MA plots checking for logFC artefacts across the range of expression values. The average logFC appears consistant across all expression values for both comparisons
topTables %>%
bind_rows() %>%
mutate(stat = -sign(logFC)*log10(PValue)) %>%
ggplot(aes(gc_content, stat)) +
geom_point(aes(colour = DE), alpha = 0.4) +
geom_smooth(se = FALSE) +
facet_wrap(~coef) +
geom_text_repel(
aes(label = gene_name, colour = DE),
data = . %>% dplyr::filter(DE)
) +
labs(
x = " Gene GC content (%)",
y = "Ranking Statistic"
) +
coord_cartesian(ylim = c(-10, 10)) +
scale_colour_manual(values = deCols) +
theme(legend.position = "none")
GC content bias for differential expression. GC content is shown against the ranking statistic, calculated by -log10(p) multiple by the sign of logFC. Some bias was observed for both comparisons. In particular, the fAI genotype shows a well-documented pattern of GC bias, whereby both low and high GC proportions are affected. This may be of concern for this dataset.
topTables %>%
bind_rows() %>%
mutate(stat = -sign(logFC)*log10(PValue)) %>%
ggplot(aes(length, stat)) +
geom_point(aes(colour = DE), alpha = 0.4) +
geom_smooth(se = FALSE) +
facet_wrap(~coef) +
geom_text_repel(
aes(label = gene_name, colour = DE),
data = . %>% dplyr::filter(DE)
) +
labs(
x = "GC length",
y = "Ranking Statistic"
) +
coord_cartesian(ylim = c(-10, 10)) +
scale_x_log10(labels = comma) +
scale_colour_manual(values = deCols) +
theme(legend.position = "none")
Gene length bias for differential expression. Gene length is shown against the ranking statistic, calculated by -log10(p) multiple by the sign of logFC. No bias is observed for either comparison.
Conditional Quantile Normalisation (CQN) is a procedure that can account for GC content and length biases in differential expression and is implemented with package cqn. It does this by making by calculating offset values that can be incorporated into the GLM. As GC content was noted as being of concern, but length showed no bias, only GC content was used to calculate the offsets. The glm.offset values from the output of cqn were added to the original DGEList object.
cqn <- cqn(
counts = dgeList$counts,
x = dgeList$genes$gc_content,
lengths = rep(1000, times = nrow(dgeList)),
sizeFactors = dgeList$samples$lib.size,
lengthMethod = "fixed"
)
cqnplot(cqn, n = 1, xlab = "GC Content", col = genoCols)
legend(
"bottomright",
legend = levels(metadata$genotype),
col = genoCols,
lty = 1
)
Model fits for GC content under the CQN model.
dgeList$offset <- cqn$glm.offset
cpm_cqn <- cqn %>%
with(y + offset)
pca_cqn <- cpm_cqn %>%
t() %>%
prcomp()
pcaVars <- percent_format(0.1)(summary(pca_cqn)$importance["Proportion of Variance",])
pca_cqn$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
left_join(metadata) %>%
as_tibble() %>%
ggplot(aes(PC1, PC2, colour = genotype, fill = genotype, label = alias)) +
geom_point(size = 2) +
geom_text_repel(show.legend = FALSE) +
stat_ellipse(geom = "polygon", alpha = 0.05, show.legend = FALSE) +
guides(fill = FALSE) +
scale_colour_manual(values = genoCols) +
labs(
x = paste0("PC1 (", pcaVars[["PC1"]], ")"),
y = paste0("PC2 (", pcaVars[["PC2"]], ")"),
colour = "Genotype"
)
Principal Component Analysis of gene-level counts post CQN.
There was very little improvement in separation of genotype groups after CQN, and PC1 still only accounted for 11.4% of the variation. The x-axis of the PCA plot appears to be flipped, but this normal as the signs of PC values are somewhat arbitrary.
After incorporation of the offset values into the DGEList object, dispersion estimates were calculated and models were fitted as done pre CQN.
fit_cqn <- estimateDisp(dgeList, design) %>%
glmFit()
topTables_cqn <- colnames(design)[2:3] %>%
sapply(function(x){
glmLRT(fit_cqn, coef = x) %>%
topTags(n = Inf) %>%
.[["table"]] %>%
as_tibble() %>%
arrange(PValue) %>%
dplyr::select(
gene_id, gene_name, logFC, logCPM, PValue, FDR, everything()
) %>%
mutate(
coef = x,
bonfP = p.adjust(PValue, "bonf"),
DE = ifelse(FDR < 0.05, TRUE, FALSE)
)
}, simplify = FALSE)
topTables_cqn$`EOfAD-like/+` %>%
dplyr::filter(DE) %>%
dplyr::select(gene_id, gene_name, logFC, logCPM, PValue, FDR, bonfP) %>%
mutate(
logCPM = formatC(logCPM, digits = 2, format = "f"),
logFC = formatC(logFC, digits = 2, format = "f"),
PValue = formatC(PValue, digits = 2, format = "e"),
FDR = formatC(FDR, digits = 2, format = "e"),
bonfP = formatC(bonfP, digits = 2, format = "e"),
) %>%
kable(
align = "l",
caption = paste(
"The", nrow(.), "differentially expressed genes for EOfAD genotype",
"in comparison to wildtype after CQN"
)
) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed", "responsive")
)
| gene_id | gene_name | logFC | logCPM | PValue | FDR | bonfP |
|---|---|---|---|---|---|---|
| ENSDARG00000078322 | col12a1a | 1.87 | 3.83 | 9.45e-21 | 2.00e-16 | 2.00e-16 |
| ENSDARG00000093378 | si:ch211-235i11.5 | 0.62 | 2.96 | 4.52e-11 | 4.78e-07 | 9.55e-07 |
| ENSDARG00000020448 | adi1 | 0.52 | 2.81 | 2.88e-09 | 2.03e-05 | 6.09e-05 |
| ENSDARG00000079324 | ano9b | 1.35 | -1.19 | 6.58e-09 | 3.48e-05 | 1.39e-04 |
| ENSDARG00000001993 | myhb | 4.22 | 2.38 | 1.47e-06 | 6.23e-03 | 3.11e-02 |
| ENSDARG00000092191 | CR318588.1 | 3.21 | 2.45 | 2.25e-06 | 7.93e-03 | 4.76e-02 |
| ENSDARG00000094719 | CR318588.3 | 2.01 | 2.71 | 3.33e-06 | 1.01e-02 | 7.04e-02 |
| ENSDARG00000088298 | si:ch211-235i11.4 | 0.33 | 3.57 | 6.47e-06 | 1.71e-02 | 1.37e-01 |
topTables_cqn$`fAI-like/+` %>%
dplyr::filter(DE) %>%
dplyr::select(gene_id, gene_name, logFC, logCPM, PValue, FDR, bonfP) %>%
mutate(
logCPM = formatC(logCPM, digits = 2, format = "f"),
logFC = formatC(logFC, digits = 2, format = "f"),
PValue = formatC(PValue, digits = 2, format = "e"),
FDR = formatC(FDR, digits = 2, format = "e"),
bonfP = formatC(bonfP, digits = 2, format = "e"),
) %>%
kable(
align = "l",
caption = paste(
"The", nrow(.), "differentially expressed genes for fAI genotype",
"in comparison to wildtype after CQN"
)
) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed", "responsive")
)
| gene_id | gene_name | logFC | logCPM | PValue | FDR | bonfP |
|---|---|---|---|---|---|---|
| ENSDARG00000004870 | psen1 | -0.81 | 5.33 | 8.89e-52 | 1.88e-47 | 1.88e-47 |
| ENSDARG00000061758 | sh3pxd2ab | -3.30 | 1.06 | 2.20e-09 | 2.32e-05 | 4.65e-05 |
| ENSDARG00000005179 | dglucy | -1.01 | 5.01 | 6.88e-09 | 4.85e-05 | 1.45e-04 |
| ENSDARG00000067665 | fam167aa | -3.62 | 0.33 | 1.23e-08 | 6.52e-05 | 2.61e-04 |
| ENSDARG00000105453 | kcnk17 | -1.66 | -1.00 | 9.78e-08 | 4.14e-04 | 2.07e-03 |
| ENSDARG00000101775 | zgc:172014 | -2.06 | -0.43 | 1.49e-06 | 5.25e-03 | 3.15e-02 |
| ENSDARG00000079992 | senp6b | 1.79 | -0.96 | 3.21e-06 | 9.70e-03 | 6.79e-02 |
| ENSDARG00000104031 | mrpl33 | -0.26 | 4.08 | 5.41e-06 | 1.43e-02 | 1.14e-01 |
| ENSDARG00000053240 | kif6 | -0.89 | 0.78 | 1.11e-05 | 2.60e-02 | 2.34e-01 |
| ENSDARG00000092522 | nnt2 | -0.77 | 0.29 | 1.36e-05 | 2.88e-02 | 2.88e-01 |
| ENSDARG00000081758 | NC_002333.12 | -1.10 | 0.95 | 2.09e-05 | 3.78e-02 | 4.43e-01 |
| ENSDARG00000037613 | lgals8b | 0.36 | 4.43 | 2.15e-05 | 3.78e-02 | 4.54e-01 |
| ENSDARG00000100743 | si:dkey-190j3.4 | 1.08 | -0.52 | 2.84e-05 | 4.62e-02 | 6.01e-01 |
topTables_cqn %>%
bind_rows() %>%
arrange(DE) %>%
ggplot(aes(logCPM, logFC)) +
geom_point(aes(colour = DE), alpha = 0.4) +
geom_text_repel(
aes(label = gene_name, colour = DE),
data = . %>% dplyr::filter(DE)
) +
geom_smooth(se = FALSE) +
facet_wrap(~coef, nrow = 2) +
scale_y_continuous(breaks = seq(-8, 8, by = 2)) +
scale_colour_manual(values = deCols) +
theme(legend.position = "none")
MA plots checking for logFC artefacts across the range of expression values. The average logFC remains consistant across all expression values for both comparisons after CQN
topTables_cqn %>%
bind_rows() %>%
mutate(stat = -sign(logFC)*log10(PValue)) %>%
ggplot(aes(gc_content, stat)) +
geom_point(aes(colour = DE), alpha = 0.4) +
geom_smooth(se = FALSE) +
facet_wrap(~coef) +
geom_text_repel(
aes(label = gene_name, colour = DE),
data = . %>% dplyr::filter(DE)
) +
labs(
x = "Gene GC content (%)",
y = "Ranking Statistic"
) +
coord_cartesian(ylim = c(-10, 10)) +
scale_colour_manual(values = deCols) +
theme(legend.position = "none")
GC content bias for differential expression. GC content is shown against the ranking statistic, calculated by -log10(p) multiple by the sign of logFC. There was still a small amount of residual bias post CQN, but an improvement was noted, particulary for the fAI genotype.
topTables_cqn %>%
bind_rows() %>%
ggplot(aes(logFC, -log10(PValue), colour = DE)) +
geom_point(alpha = 0.4) +
geom_text_repel(
aes(label = gene_name),
size = 3,
data = . %>% dplyr::filter(DE)
) +
geom_text_repel(
aes(label = gene_name),
size = 3,
data = . %>% dplyr::filter(!DE, logFC < -3)
) +
facet_wrap(~coef, nrow = 1) +
scale_colour_manual(values = deCols) +
scale_x_continuous(breaks = seq(-8, 8, by = 2)) +
coord_cartesian(ylim = c(0, 10)) +
labs(y = log10plab) +
theme(legend.position = "none")
Volcano plots showing -log10(p) against logFC. Genes determined to be DE are coloured red.
geneLabeller <- as_labeller(
structure(dgeList$genes$gene_name, names = dgeList$genes$gene_id)
)
fadComp <- metadata %>%
dplyr::filter(str_detect(alias, "(fAD|wt)")) %>%
pull(sample)
faiComp <- metadata %>%
dplyr::filter(str_detect(alias, "(fAI|wt)")) %>%
pull(sample)
deGenes <- lapply(topTables_cqn, function(x){
dplyr::filter(x, DE) %>%
pull(gene_id)
})
deCpmPlots <- lapply(seq_along(deGenes), function(x){
cpms <- cpm_cqn[deGenes[[x]],] %>%
as.data.frame() %>%
rownames_to_column("gene_id") %>%
gather(key = "sample", value = logCPM, -gene_id) %>%
as_tibble() %>%
left_join(metadata[,c("sample", "genotype")])
if (names(deGenes[x]) == "EOfAD-like/+") {
dplyr::filter(cpms, sample %in% fadComp) %>%
ggplot(aes(genotype, logCPM)) +
geom_boxplot(aes(fill = genotype)) +
labs(title = paste(names(deGenes[x]), "DE genes")) +
facet_wrap(
~gene_id,
labeller = geneLabeller,
nrow = 2
) +
theme(
legend.position = "none",
title = element_text(face = "bold")
)
} else if (names(deGenes[x]) == "fAI-like/+") {
dplyr::filter(cpms, sample %in% faiComp) %>%
ggplot(aes(genotype, logCPM)) +
geom_boxplot(aes(fill = genotype)) +
labs(title = paste(names(deGenes[x]), "DE genes")) +
facet_wrap(
~gene_id,
labeller = geneLabeller,
nrow = 4
) +
theme(
legend.position = "none",
title = element_text(face = "bold")
)
}
})
A set of 8 were classified as DE due to the EOfAD-like/+ genotype. The highest ranked genes by p-value are shown in the following table. Interestingly, 4 of these genes were located on chromosome 17, the same chromosome as the mutation.
topTables_cqn$`EOfAD-like/+` %>%
dplyr::slice(1:1000) %>%
dplyr::select(gene_id, gene_name, logFC, logCPM, PValue, FDR, bonfP, DE) %>%
mutate(
logCPM = formatC(logCPM, digits = 2, format = "f"),
logFC = formatC(logFC, digits = 2, format = "f"),
PValue = formatC(PValue, digits = 2, format = "e"),
FDR = formatC(FDR, digits = 2, format = "e"),
bonfP = formatC(bonfP, digits = 2, format = "e"),
) %>%
datatable(
caption = htmltools::tags$caption(htmltools::em(paste0(
"EOfAD-like/+ genotype: The top 1000 genes ranked by p-value from ",
"differential expression testing."
))),
filter = "top"
)
cpm_cqn[deGenes$`EOfAD-like/+`,] %>%
as.data.frame() %>%
dplyr::select(all_of(fadComp)) %>%
set_rownames(idToSym(rownames(.), genes)) %>%
pheatmap::pheatmap(
color = viridis_pal(option = "magma")(100),
legend_breaks = c(seq(-2, 8, by = 2), max(.)),
legend_labels = c(seq(-2, 8, by = 2), "logCPM\n"),
annotation_col = dgeList$samples %>%
dplyr::select(Genotype = genotype),
annotation_names_col = FALSE,
annotation_colors = list(Genotype = genoCols),
cutree_rows = 3,
angle_col = "315"
)
Heatmap of DE genes in the comparison between EOfAD-like/+ and WT genotypes. Values a logCPM after CQN.
deCpmPlots[[1]]
logCPM boxplots between comparative groups for genes classified as DE.
A set of 13 were classified as DE due to the fAI-like/+ genotype. The highest ranked genes by p-value are shown in the following table. The location of DE genes followed a similar trend to the Alzheimer’s mutant with 9 genes located on chromosome 17.
topTables_cqn$`fAI-like/+` %>%
dplyr::slice(1:1000) %>%
dplyr::select(gene_id, gene_name, logFC, logCPM, PValue, FDR, bonfP, DE) %>%
mutate(
logCPM = formatC(logCPM, digits = 2, format = "f"),
logFC = formatC(logFC, digits = 2, format = "f"),
PValue = formatC(PValue, digits = 2, format = "e"),
FDR = formatC(FDR, digits = 2, format = "e"),
bonfP = formatC(bonfP, digits = 2, format = "e"),
) %>%
datatable(
caption = htmltools::tags$caption(htmltools::em(paste0(
"fAI-like/+ genotype: The top 1000 genes ranked by p-value from ",
"differential expression testing."
))),
filter = "top"
)
cpm_cqn[deGenes$`fAI-like/+`,] %>%
as.data.frame() %>%
dplyr::select(all_of(faiComp)) %>%
set_rownames(idToSym(rownames(.), genes)) %>%
pheatmap::pheatmap(
color = viridis_pal(option = "magma")(100),
legend_breaks = c(seq(-2, 8, by = 2), max(.)),
legend_labels = c(seq(-2, 8, by = 2), "logCPM\n"),
annotation_col = dgeList$samples %>%
dplyr::select(Genotype = genotype),
annotation_names_col = FALSE,
annotation_colors = list(Genotype = genoCols),
cutree_rows = 2,
angle_col = "315",
)
Heatmap of DE genes in the comparison between fAI-like/+ and WT genotypes. Values a logCPM after CQN.
deCpmPlots[[2]]
logCPM boxplots between comparative groups for genes classified as DE.
Final results were exported as Rds files for further analysis.
saveRDS(topTables, here("files/topTables.Rds"))
saveRDS(topTables_cqn, here("files/topTables_cqn.Rds"))
sessionInfo() %>%
pander()
R version 4.1.0 (2021-05-18)
Platform: x86_64-pc-linux-gnu (64-bit)
locale: LC_CTYPE=en_AU.UTF-8, LC_NUMERIC=C, LC_TIME=en_AU.UTF-8, LC_COLLATE=en_AU.UTF-8, LC_MONETARY=en_AU.UTF-8, LC_MESSAGES=en_AU.UTF-8, LC_PAPER=en_AU.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_AU.UTF-8 and LC_IDENTIFICATION=C
attached base packages: stats4, splines, parallel, stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: ensembldb(v.2.16.0), AnnotationFilter(v.1.16.0), GenomicFeatures(v.1.44.0), AnnotationDbi(v.1.54.0), Biobase(v.2.52.0), GenomicRanges(v.1.44.0), GenomeInfoDb(v.1.28.0), IRanges(v.2.26.0), S4Vectors(v.0.30.0), pander(v.0.6.3), htmltools(v.0.5.1.1), DT(v.0.18), ggpubr(v.0.4.0), cqn(v.1.38.0), quantreg(v.5.85), SparseM(v.1.81), preprocessCore(v.1.54.0), nor1mix(v.1.3-0), mclust(v.5.4.7), pheatmap(v.1.0.12), RColorBrewer(v.1.1-2), fgsea(v.1.18.0), ggrepel(v.0.9.1), edgeR(v.3.34.0), limma(v.3.48.0), kableExtra(v.1.3.4), scales(v.1.1.1), AnnotationHub(v.3.0.0), BiocFileCache(v.2.0.0), dbplyr(v.2.1.1), BiocGenerics(v.0.38.0), here(v.1.0.1), magrittr(v.2.0.1), forcats(v.0.5.1), stringr(v.1.4.0), dplyr(v.1.0.6), purrr(v.0.3.4), readr(v.1.4.0), tidyr(v.1.1.3), tibble(v.3.1.2), ggplot2(v.3.3.3) and tidyverse(v.1.3.1)
loaded via a namespace (and not attached): utf8(v.1.2.1), tidyselect(v.1.1.1), RSQLite(v.2.2.7), htmlwidgets(v.1.5.3), grid(v.4.1.0), BiocParallel(v.1.26.0), munsell(v.0.5.0), withr(v.2.4.2), colorspace(v.2.0-1), filelock(v.1.0.2), highr(v.0.9), knitr(v.1.33), rstudioapi(v.0.13), ggsignif(v.0.6.1), MatrixGenerics(v.1.4.0), labeling(v.0.4.2), GenomeInfoDbData(v.1.2.6), bit64(v.4.0.5), farver(v.2.1.0), rprojroot(v.2.0.2), vctrs(v.0.3.8), generics(v.0.1.0), xfun(v.0.23), R6(v.2.5.0), locfit(v.1.5-9.4), bitops(v.1.0-7), cachem(v.1.0.5), DelayedArray(v.0.18.0), assertthat(v.0.2.1), promises(v.1.2.0.1), BiocIO(v.1.2.0), gtable(v.0.3.0), conquer(v.1.0.2), rlang(v.0.4.11), MatrixModels(v.0.5-0), systemfonts(v.1.0.2), rtracklayer(v.1.52.0), rstatix(v.0.7.0), lazyeval(v.0.2.2), broom(v.0.7.6), BiocManager(v.1.30.15), yaml(v.2.2.1), abind(v.1.4-5), modelr(v.0.1.8), crosstalk(v.1.1.1), backports(v.1.2.1), httpuv(v.1.6.1), tools(v.4.1.0), ellipsis(v.0.3.2), jquerylib(v.0.1.4), Rcpp(v.1.0.6), progress(v.1.2.2), zlibbioc(v.1.38.0), RCurl(v.1.98-1.3), prettyunits(v.1.1.1), SummarizedExperiment(v.1.22.0), haven(v.2.4.1), fs(v.1.5.0), data.table(v.1.14.0), openxlsx(v.4.2.3), reprex(v.2.0.0), ProtGenerics(v.1.24.0), matrixStats(v.0.58.0), hms(v.1.1.0), mime(v.0.10), evaluate(v.0.14), xtable(v.1.8-4), XML(v.3.99-0.6), rio(v.0.5.26), readxl(v.1.3.1), gridExtra(v.2.3), compiler(v.4.1.0), biomaRt(v.2.48.0), crayon(v.1.4.1), mgcv(v.1.8-36), later(v.1.2.0), lubridate(v.1.7.10), DBI(v.1.1.1), MASS(v.7.3-54), rappdirs(v.0.3.3), Matrix(v.1.3-4), car(v.3.0-10), cli(v.2.5.0), pkgconfig(v.2.0.3), GenomicAlignments(v.1.28.0), foreign(v.0.8-81), xml2(v.1.3.2), svglite(v.2.0.0), bslib(v.0.2.5.1), webshot(v.0.5.2), XVector(v.0.32.0), rvest(v.1.0.0), digest(v.0.6.27), Biostrings(v.2.60.0), rmarkdown(v.2.8), cellranger(v.1.1.0), fastmatch(v.1.1-0), restfulr(v.0.0.13), curl(v.4.3.1), shiny(v.1.6.0), Rsamtools(v.2.8.0), rjson(v.0.2.20), lifecycle(v.1.0.0), nlme(v.3.1-152), jsonlite(v.1.7.2), carData(v.3.0-4), viridisLite(v.0.4.0), fansi(v.0.5.0), pillar(v.1.6.1), lattice(v.0.20-44), KEGGREST(v.1.32.0), fastmap(v.1.1.0), httr(v.1.4.2), interactiveDisplayBase(v.1.30.0), glue(v.1.4.2), zip(v.2.2.0), png(v.0.1-7), BiocVersion(v.3.13.1), bit(v.4.0.4), stringi(v.1.6.2), sass(v.0.4.0), blob(v.1.2.1) and memoise(v.2.0.0)